from google.cloud import storage
import pandas as pd
import io
import os
import gzip
import plotly.express as px
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
Cell In[1], line 1
----> 1 from google.cloud import storage
2 import pandas as pd
3 import io
ModuleNotFoundError: No module named 'google'
# Authenticate to GCS via a service-account key file.
# NOTE(review): service_account_id itself is never used below — kept for reference.
service_account_id = 'elijahsandler@net-data-viz-handbook.iam.gserviceaccount.com'
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'C:\\Users\\elija\\Documents\\24f-coop\\net-data-viz-handbook-fe2c5531555d.json'
# Initialize a GCS client
client = storage.Client()
print('1 ')
# Specify your bucket name and the specific .csv.gz file you want
bucket_name = 'gs_net-data-viz-handbook'
file_name = 'sample/sample_SIR_0_countries_incidence_daily.csv.gz' # Update this to the specific file name
meta_file = 'sample/sample_SIR_0_meta.csv.gz'
print('2 ')
# Get the bucket and blobs
bucket = client.get_bucket(bucket_name)
blob = bucket.blob(file_name)
metablob = bucket.blob(meta_file)
print('3 ')


def _read_csv_gz(gcs_blob):
    """Download a .csv.gz blob from GCS and parse it into a DataFrame.

    Factors out the download/gunzip/read_csv sequence that was previously
    duplicated for the incidence file and the metadata file.
    """
    compressed = gcs_blob.download_as_bytes()
    with gzip.GzipFile(fileobj=io.BytesIO(compressed)) as gz:
        return pd.read_csv(gz)


# Daily incidence per (date, country, run)
df = _read_csv_gz(blob)
print('4 ')
# Per-run simulation metadata
df_meta = _read_csv_gz(metablob)
print('5 ')
1
2
3
4
5
df_meta.iloc[4, :]
run_id 5
starting_date 2009-02-18
last_day_sims 2010-02-17
n_runs 100
fraction_susceptible [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...
...
npi_multipliers_rule min
frac_pops NaN
ref_compartments [""]
avg_time_valid_runs 1877.09113
avg_time_all_runs 1877.09113
Name: 4, Length: 85, dtype: object
# Collapse sub-daily rows: total incidence per (date, country_id, run_id).
df_sum = df.drop(['t'], axis=1).groupby(['date', 'country_id', 'run_id']).sum()
# get only 1 country's data
country = 0
# Select all dates for this one country from the (date, country_id, run_id) MultiIndex.
df_country = df_sum.loc[(slice(None), country), :]
# Sum the remaining value columns for each (date, run_id) pair, then flatten
# the MultiIndex into ordinary columns: 'date', 'run_id', and 0 (the totals).
df_country = df_country.droplevel('country_id').T.sum().reset_index()
# pivoting data. god what a good function.
# Wide layout: one row per date, one column per run; missing days become 0.
df_pivot = df_country.reset_index().pivot(index='date', columns='run_id', values=0).fillna(0)
# zero-indexing run_id because we aren't barbarians
df_pivot.columns = df_pivot.columns - 1
df_pivot
| run_id | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| date | |||||||||||||||||||||
| 2009-02-17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2009-02-18 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2009-02-19 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2009-02-20 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2009-02-21 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2010-02-13 | 54 | 0 | 11 | 8 | 12 | 0 | 31 | 33 | 17 | 19 | ... | 0 | 29 | 99 | 25 | 14 | 0 | 0 | 0 | 0 | 25 |
| 2010-02-14 | 47 | 0 | 8 | 14 | 10 | 0 | 18 | 29 | 16 | 10 | ... | 0 | 33 | 70 | 9 | 17 | 0 | 0 | 0 | 0 | 11 |
| 2010-02-15 | 49 | 0 | 9 | 17 | 24 | 0 | 27 | 29 | 23 | 14 | ... | 0 | 33 | 77 | 15 | 14 | 0 | 0 | 0 | 0 | 21 |
| 2010-02-16 | 56 | 0 | 4 | 22 | 13 | 0 | 16 | 31 | 10 | 11 | ... | 0 | 28 | 60 | 21 | 21 | 0 | 0 | 0 | 0 | 16 |
| 2010-02-17 | 42 | 0 | 6 | 22 | 10 | 0 | 19 | 30 | 18 | 14 | ... | 0 | 24 | 54 | 15 | 19 | 0 | 0 | 0 | 0 | 15 |
366 rows × 100 columns
# Spaghetti plot: one line per simulation run, indexed by date.
axis_labels = {
    "value": "Cases?",
    "date": "Date",
}
fig = px.line(
    df_pivot,
    labels=axis_labels,
    title="2009/10 Incidence Spaghetti Plot",
)
fig.show()
C:\Users\elija\anaconda3\Lib\site-packages\plotly\express\_core.py:1222: PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
df_output[col_name] = to_unindexed_series(df_input[argument])
And just for fun…#
It’s not really for fun because, you know, it’s my job, but whatever. It’s fun too.
from curvestat import CurveBoxPlot
from curvestat import LoadRisk
from time import time
import numpy as np
import plotly.graph_objs as go
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import euclidean_distances
from matplotlib.colors import to_rgba
from collections import defaultdict
# One integer time step per row (day) of the pivoted incidence table.
time_arr = np.arange(len(df_pivot))
time_arr;
# Build a curvestat functional box plot over all runs and time it.
start = time()
# NOTE(review): 'sample_repititions' is the library's own (misspelled) keyword.
# The sampling makes results non-deterministic run to run.
Boxplot = CurveBoxPlot(curves=df_pivot,sample_curves=10,sample_repititions=40,time=time_arr)
# Choose ranking
rank_allornothing=Boxplot.rank_allornothing()
rank_max = Boxplot.rank_max()
# Get envelope with this choice of ranking
boundaries = Boxplot.get_boundaries(rank_max,percentiles=[50,90])
# ... HEATMAP :
# First, define which curves we want the heatmap for
heatmapcurves = list(boundaries['curve-based'][50]['curves'])
# Then find the heatmap
heatmap_50 = Boxplot.get_peakheatmap(heatmap_curves=heatmapcurves)
# Plot results
Boxplot.plot_everything()
end = time()
diff = end - start
print(f"Ran {len(df_pivot)} curves in {round(diff, 2)} seconds")
Ran 366 curves in 1.94 seconds
start = time()
# Pairwise L1 distance (sum of absolute daily differences) between every
# pair of curves.  Unfortunately, pairwise comparisons are O(n^2).
# Fixes: dropped the no-op `**1`, and use enumerate() instead of indexing
# `columns[first_curve+1:]`, which conflated column LABELS with positions
# (it only worked because the labels happen to be 0..99).
curve_diff = dict()
cols = df_pivot.columns
for i, first_curve in enumerate(cols):
    for second_curve in cols[i + 1:]:
        curve_diff[(first_curve, second_curve)] = \
            (df_pivot[first_curve] - df_pivot[second_curve]).abs().sum()
end = time()
print(f'pairwise compared {len(df_pivot)} curves in {round(end-start, 4)} seconds')
pairwise compared 366 curves in 1.4989 seconds
# Get all unique curve ids that appear in any pair
elements = sorted(set(i for pair in curve_diff.keys() for i in pair))
n = len(elements)
# Create the symmetric distance matrix from the pairwise scores
distance_matrix = np.zeros((n, n))
for (i, j), score in curve_diff.items():
    distance_matrix[i, j] = score
    distance_matrix[j, i] = score  # Because the matrix is symmetric
# Convert distance matrix to similarity matrix
similarity_matrix = np.max(distance_matrix) - distance_matrix
# Fit K-Means clustering.  n_init is pinned to silence the sklearn
# FutureWarning, and random_state makes the grouping reproducible —
# determinism matters here (non-determinism is criticized later in this doc).
k = 3  # Change this to the number of desired clusters
kmeans = KMeans(n_clusters=k, n_init=10, random_state=0)
labels = kmeans.fit_predict(similarity_matrix)
C:\Users\elija\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1412: FutureWarning:
The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
C:\Users\elija\anaconda3\Lib\site-packages\sklearn\cluster\_kmeans.py:1436: UserWarning:
KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.
# One fixed color per cluster label (more colors than clusters is fine).
colors = ['red', 'blue', 'green', 'gray', 'cyan', 'yellow', 'gray']
cmap = dict(zip(range(len(colors)), colors))
traces = []
# Build one line trace per curve, colored by that curve's cluster label.
for column, group in zip(df_pivot.columns, labels):
    traces.append(
        go.Scatter(
            x=df_pivot.index,
            y=df_pivot[column],
            mode='lines',
            name=f'Curve {column}',
            line=dict(color=cmap[group]),
        )
    )
fig = go.Figure(traces)
# Label axes; hide the 100-entry legend.
fig.update_layout(
    title='Grouped Incidence',
    xaxis_title='Date',
    yaxis_title='Cases? ',
    showlegend=False,
)
# Show the plot
fig.show()
# Per-cluster store: di[label]['data'] holds that cluster's member curves,
# di[label]['boundaries'] the curvestat envelopes computed from them.
di = defaultdict(lambda: defaultdict())
for label in np.unique(labels):
    member_cols = [i for i in df_pivot.columns if labels[i] == label]
    di[label]['data'] = df_pivot[member_cols]
    # create boxplot instance, add curves
    Boxplot = CurveBoxPlot(curves=di[label]['data'], sample_curves=10,
                           sample_repititions=500, time=time_arr)
    # rank curves
    rank_allornothing = Boxplot.rank_allornothing()
    boundaries = Boxplot.get_boundaries(rank_allornothing, percentiles=[0, 50, 90])
    di[label]['boundaries'] = boundaries
# Calendar dates for the x-axis of later plots.
date_arr = df_pivot.index
date_arr;
This plot uses AUC for grouping curves, then curvestat for finding the boundaries for the median, middle quartiles, and middle 90%.
def _add_band(fig, group, pct, alpha, lower_label, upper_label):
    """Add one shaded min/max envelope (the `pct`% band) for `group`.

    Factors out the four near-identical trace blocks that were previously
    copy-pasted per group.  The lower edge is an invisible line; the upper
    trace fills down onto it ('tonexty') with the group color at `alpha`.
    """
    bounds = di[group]['boundaries']['curve-based'][pct]
    fig.add_trace(go.Scatter(
        name=f'{lower_label} of {group}',
        x=date_arr,
        y=bounds['min-boundary'],
        marker=dict(color="#444"),
        line=dict(width=0),
        mode='lines',
        showlegend=False,
        legendgroup=str(group)  # Assign to legend group
    ))
    fig.add_trace(go.Scatter(
        name=f'{upper_label} of {group}',
        x=date_arr,
        y=bounds['max-boundary'],
        mode='lines',
        marker=dict(color="#444"),
        line=dict(width=0),
        fillcolor='rgba' + str(to_rgba(cmap[group], alpha=alpha)),
        fill='tonexty',
        showlegend=False,
        legendgroup=str(group)  # Assign to legend group
    ))


fig = go.Figure()
for group in di:
    # Middle 90% band (5th-95th percentile), lightly shaded
    _add_band(fig, group, 90, 0.1, 'Lower 5%', 'Upper 5%')
    # Middle 50% band, more opaque
    _add_band(fig, group, 50, 0.2, 'Lower Quartile', 'Upper Quartile')
    # Most Central Curve — the only trace of the group shown in the legend
    fig.add_trace(go.Scatter(
        name=f'Group {group}',
        x=date_arr,
        y=df_pivot[di[group]['boundaries']['curve-based'][0]['curves'][0]],
        marker=dict(color=cmap[group]),
        line=dict(width=1),
        mode='lines',
        showlegend=True,  # Show legend for the central curve
        legendgroup=str(group)  # Assign to legend group
    ))
# Update layout to show the legend
fig.update_layout(
    title='Middle 90% Curves for Each Group',
    xaxis_title='Date',
    yaxis_title='Cases',
    showlegend=True  # Enable legend
)
fig.show()
pd.DataFrame(curve_diff, index=['Distance']).T
| Distance | ||
|---|---|---|
| 0 | 1 | 69902 |
| 2 | 63335 | |
| 3 | 62744 | |
| 4 | 51459 | |
| 5 | 69902 | |
| ... | ... | ... |
| 96 | 98 | 0 |
| 99 | 76178 | |
| 97 | 98 | 0 |
| 99 | 76178 | |
| 98 | 99 | 76178 |
4950 rows × 1 columns
# Every curve id that appears in any pair, in a DETERMINISTIC order so the
# DataFrame rows line up with `labels` (which is indexed by curve id).
# Iterating a raw set only happened to be ordered for small ints.
curves = sorted(set(curve for pair in curve_diff.keys() for curve in pair))
centrality = {curve: 0 for curve in curves}
# Compute centrality scores: total distance from each curve to all others
for (curve1, curve2), diff in curve_diff.items():
    centrality[curve1] += diff
    centrality[curve2] += diff
# Convert centrality scores to a DataFrame for easy sorting
centrality_df = pd.DataFrame(list(centrality.items()), columns=['Curve', 'Centrality'])
# Min-max normalize and invert: 1 = most central, 0 = least central.
centrality_df['Centrality'] = 1 - (centrality_df['Centrality']-centrality_df['Centrality'].min())/(centrality_df['Centrality'].max()-centrality_df['Centrality'].min())
# Relies on rows being ordered by curve id (guaranteed by sorted() above).
centrality_df['Group'] = labels
centrality_df.head()
| Curve | Centrality | Group | |
|---|---|---|---|
| 0 | 0 | 0.792745 | 0 |
| 1 | 1 | 0.449300 | 2 |
| 2 | 2 | 0.628479 | 1 |
| 3 | 3 | 0.646675 | 1 |
| 4 | 4 | 0.832616 | 1 |
# Rank curves from most to least central; reminder that centrality is the
# (inverted, normalized) sum of areas between this curve and all others.
ranked_curves = centrality_df.sort_values(by='Centrality', ascending=False, ignore_index=True)
ranked_curves
| Curve | Centrality | Group | |
|---|---|---|---|
| 0 | 83 | 1.000000 | 1 |
| 1 | 32 | 0.997673 | 1 |
| 2 | 9 | 0.992977 | 1 |
| 3 | 48 | 0.991864 | 1 |
| 4 | 86 | 0.989714 | 1 |
| ... | ... | ... | ... |
| 95 | 92 | 0.363995 | 0 |
| 96 | 79 | 0.331418 | 0 |
| 97 | 65 | 0.314794 | 0 |
| 98 | 27 | 0.066969 | 1 |
| 99 | 71 | 0.000000 | 0 |
100 rows × 3 columns
px.histogram(x=ranked_curves['Centrality'], color=ranked_curves['Group'])
# Pointwise quantiles across the 100 runs at each date.
# NOTE: this rebinds the name `df` (previously the raw incidence table).
df = df_pivot.quantile([0, .10, .25, .75, .90, 1], axis=1).T
df['Median'] = df_pivot.median(axis=1)
df['Mean'] = df_pivot.mean(axis=1)
px.scatter(df, title='The Problem with Curve Box Plots')
This graph illustrates the issue we face. There are times when the 75th percentile points look as though they are well above the 90th percentile points, which obviously isn’t right. curvestat gets around this problem by basically just widening the confidence interval: now the 75% interval looks like the 90% interval. That doesn’t help us though. At that point, we’re deviating from the conventional use of statistics, and while I’m a big fan of breaking conventions, it is counter-productive with regard to our goal of making these graphs more legible.
curvestat’s use of sampling also means that the package is not strictly deterministic, which is a big issue as well. This is rectified with my “area under the curve” method.
# one week rolling maximum for each run (the first 6 NaN days are dropped)
df_roll = df_pivot.rolling(7).max().dropna()
# same pointwise-quantile construction as above, on the smoothed curves
df = df_roll.T.quantile([0, .10, .25, .75, .90, 1]).T
df['Median'] = df_roll.T.median()
df['Mean'] = df_roll.T.mean()
# Fixed title typo: "Rollling" -> "Rolling"
px.scatter(df, title='The Problem with Curve Box Plots: Rolling Average')
(df_pivot.max() < 1200).astype(int).mean()
0.48
# if you get rid of outliers then uhhhhh it looks great
# Fraction of runs whose 7-day rolling-MEDIAN peak exceeds 1200.
((df_pivot.rolling(7).median().dropna().max()) > 1200).mean()
0.28
This seems to suggest that ~75% of the peaks are below 1200. Alas, 48% of them are.
How is that possible? Well, the graph isn’t suggesting that 90% of the peaks are below 1200. In any way. It’s suggesting that at time Nov 19, 2009, 75% of curves will be below 1200, and at every other time, at least 75% of curves will be lower than that. Which is true! If you have a graph where the axes are date and value, then any given point on the graph represents the coincidence of the date and value: not the value and every date.
px.histogram(df_pivot.max(), nbins=20)
Putting it all together#
Last shot: Let’s group the curves using AUC, then calculate their centrality relative to only the curves in their group so we can cut curvestat’s BS out altogether and hopefully nail this.
start = time()
# Pairwise L1 distance between every pair of curves.
# Unfortunately, pairwise comparisons are O(n^2).
# Same fixes as the earlier copy of this cell: no-op `**1` removed, and
# enumerate() used instead of label-as-position indexing.
curve_diff = dict()
cols = df_pivot.columns
for i, first_curve in enumerate(cols):
    for second_curve in cols[i + 1:]:
        curve_diff[(first_curve, second_curve)] = \
            (df_pivot[first_curve] - df_pivot[second_curve]).abs().sum()
# Get all unique elements
elements = sorted(set(i for pair in curve_diff.keys() for i in pair))
n = len(elements)
# Create the symmetric distance matrix
distance_matrix = np.zeros((n, n))
for (i, j), score in curve_diff.items():
    distance_matrix[i, j] = score
    distance_matrix[j, i] = score  # Because the matrix is symmetric
# Convert distance matrix to similarity matrix
similarity_matrix = np.max(distance_matrix) - distance_matrix
# Fit K-Means clustering; n_init/random_state pinned for reproducibility.
k = 3  # Change this to the number of desired clusters
kmeans = KMeans(n_clusters=k, n_init=10, random_state=0)
labels = kmeans.fit_predict(similarity_matrix)
The curves are now grouped via AUC.
di[0]['data'].median()
run_id
2 9.5
3 8.5
4 9.5
6 5.0
7 4.0
8 14.0
9 9.0
11 16.5
13 12.5
17 4.5
18 19.0
20 19.0
24 6.5
25 4.0
27 18.0
28 16.0
29 0.0
30 5.0
32 2.5
34 13.5
35 5.0
36 28.0
38 7.0
40 0.0
41 16.0
43 5.0
44 9.0
46 5.0
47 1.0
48 10.0
49 2.0
51 25.0
52 14.0
54 20.0
55 13.5
56 17.0
59 17.0
61 3.5
62 13.0
66 1.0
67 5.5
68 1.0
69 35.5
72 32.0
73 22.5
76 4.5
78 7.5
80 4.5
83 5.0
84 21.0
85 1.0
86 3.0
91 37.0
93 6.0
94 10.5
99 11.0
dtype: float64
# Deterministic curve order so rows line up with `labels` below
# (iterating a raw set only happened to be ordered for small ints).
curves = sorted(set(curve for pair in curve_diff.keys() for curve in pair))
centrality = {curve: 0 for curve in curves}
# Compute centrality scores, counting only distances WITHIN the same group
for (curve1, curve2), diff in curve_diff.items():
    if labels[curve1] == labels[curve2]:
        centrality[curve1] += diff
        centrality[curve2] += diff
# Convert centrality scores to a DataFrame for easy sorting.
# (Unlike the earlier cell, the distances are left un-normalized here.)
centrality_df = pd.DataFrame(list(centrality.items()), columns=['Curve', 'Distance'])
centrality_df['Group'] = labels
centrality_df
| Curve | Distance | Group | |
|---|---|---|---|
| 0 | 0 | 375660 | 2 |
| 1 | 1 | 0 | 1 |
| 2 | 2 | 1721923 | 0 |
| 3 | 3 | 1560783 | 0 |
| 4 | 4 | 1247105 | 0 |
| ... | ... | ... | ... |
| 95 | 95 | 0 | 1 |
| 96 | 96 | 0 | 1 |
| 97 | 97 | 0 | 1 |
| 98 | 98 | 0 | 1 |
| 99 | 99 | 1393709 | 0 |
100 rows × 3 columns
def get_central_curves(percentile, group, df=centrality_df):
    """Return the `percentile`% most central curve ids from `group`.

    Curves are ranked by ascending within-group distance (smaller = more
    central); at least one curve is always returned, even at percentile 0.
    """
    df = df[df['Group'] == group]
    # sort_values(inplace=True) on a filtered slice raised a
    # SettingWithCopyWarning; reassigning a sorted copy avoids it.
    df = df.sort_values('Distance')
    # Upper bound on the (reset) positional index; max() keeps >= 1 curve.
    stop = max(0, (percentile / 100) * len(df) - 1)
    return list(df.reset_index().loc[:stop, 'Curve'])


get_central_curves(0, 0)
C:\Users\elija\AppData\Local\Temp\ipykernel_29600\422604665.py:5: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
| run_id | 54 |
|---|---|
| date | |
| 2009-02-17 | 0 |
| 2009-02-18 | 0 |
| 2009-02-19 | 0 |
| 2009-02-20 | 0 |
| 2009-02-21 | 0 |
| ... | ... |
| 2010-02-13 | 8 |
| 2010-02-14 | 11 |
| 2010-02-15 | 7 |
| 2010-02-16 | 8 |
| 2010-02-17 | 12 |
366 rows × 1 columns
# Total wall-clock time for the grouping + centrality pipeline above.
end = time()
print(f'Ran process on {len(df_pivot)} curves in {round(end-start, 4)} seconds')
Creating a heatmap#
from scipy.stats import gaussian_kde
px.scatter(df_pivot, color=['blue']*len(df_pivot))
C:\Users\elija\anaconda3\Lib\site-packages\plotly\express\_core.py:1222: PerformanceWarning:
DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
C:\Users\elija\anaconda3\Lib\site-packages\plotly\express\_core.py:1251: PerformanceWarning:
DataFrame is highly fragmented. This is usually the result of calling `frame.insert` many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
# Map each calendar date to its integer time step.
dt = {d: t for d, t in zip(date_arr, time_arr)}
df_country.head(1)
| date | run_id | 0 | |
|---|---|---|---|
| 0 | 2009-02-17 | 1 | 0 |
df_country.groupby('date').sum()[df_country.groupby('date').sum()[0] != 0].index
Index(['2009-04-26', '2009-05-09', '2009-05-23', '2009-06-09', '2009-06-14',
'2009-06-15', '2009-06-16', '2009-06-17', '2009-06-18', '2009-06-19',
...
'2010-02-08', '2010-02-09', '2010-02-10', '2010-02-11', '2010-02-12',
'2010-02-13', '2010-02-14', '2010-02-15', '2010-02-16', '2010-02-17'],
dtype='object', name='date', length=253)
# Keep only dates with nonzero total incidence.  The groupby-sum is now
# computed once and reused (it was previously evaluated twice inline).
daily_totals = df_country.groupby('date').sum()
active_dates = daily_totals[daily_totals[0] != 0].index
filtered_df_country = df_country[df_country['date'].isin(active_dates)]
# Integer time step and incidence value for every remaining (date, run) point
peak_time = filtered_df_country['date'].map(dt).values
peak_value = filtered_df_country[0].values
# Calculate point density with a 2-D Gaussian KDE over (time, value)
xy = np.vstack([peak_time, peak_value])
peak_density = gaussian_kde(xy)(xy)
# Sort the points by density, so that the densest points are plotted last
idx = peak_density.argsort()
peak_heatmap = {'peak_time': peak_time[idx], 'peak_value': peak_value[idx], 'peak_density': peak_density[idx]}
pd.DataFrame(peak_heatmap)
| peak_time | peak_value | peak_density | |
|---|---|---|---|
| 0 | 269 | 1616 | 2.695207e-08 |
| 1 | 268 | 1603 | 2.999189e-08 |
| 2 | 267 | 1578 | 3.638725e-08 |
| 3 | 275 | 1525 | 7.821058e-08 |
| 4 | 311 | 1474 | 8.430531e-08 |
| ... | ... | ... | ... |
| 25295 | 158 | 2 | 2.496828e-05 |
| 25296 | 159 | 3 | 2.496927e-05 |
| 25297 | 159 | 3 | 2.496927e-05 |
| 25298 | 159 | 3 | 2.496927e-05 |
| 25299 | 159 | 3 | 2.496927e-05 |
25300 rows × 3 columns
px.histogram(peak_heatmap['peak_density']**.0000002)
import matplotlib.pyplot as plt
# Density-shaded scatter; the near-zero exponent flattens the color scale.
plt.scatter(x=peak_heatmap['peak_time'], y=peak_heatmap['peak_value'], c=peak_heatmap['peak_density']**.0000010)
<matplotlib.collections.PathCollection at 0x24def2a4190>
len(pd.DataFrame(peak_heatmap))
25300